{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import os\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from tqdm import tqdm\n",
    "import augmentation\n",
    "import scipy\n",
    "import librosa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2240, 560)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wavs = glob.glob('data/*.wav')\n",
    "train_X, test_X = train_test_split(wavs, test_size = 0.2)\n",
    "len(train_X), len(test_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not os.path.exists('augment'):\n",
    "    os.makedirs('augment')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in tqdm(range(len(train_X))):\n",
    "    wav = train_X[i]\n",
    "    try:\n",
    "        root, ext = os.path.splitext(wav)\n",
    "        root = root.split('/')[1]\n",
    "        root = '%s/%s' % ('augment', root)\n",
    "        sample_rate, samples = scipy.io.wavfile.read(wav)\n",
    "        aug = augmentation.change_pitch_speech(samples)\n",
    "        librosa.output.write_wav(\n",
    "            '%s-1%s' % (root, ext),\n",
    "            aug.astype('float32'),\n",
    "            sample_rate,\n",
    "            norm = True,\n",
    "        )\n",
    "        aug = augmentation.change_amplitude(samples)\n",
    "        librosa.output.write_wav(\n",
    "            '%s-2%s' % (root, ext),\n",
    "            aug.astype('float32'),\n",
    "            sample_rate,\n",
    "            norm = True,\n",
    "        )\n",
    "\n",
    "        aug = augmentation.add_noise(samples)\n",
    "        librosa.output.write_wav(\n",
    "            '%s-3%s' % (root, ext),\n",
    "            aug.astype('float32'),\n",
    "            sample_rate,\n",
    "            norm = True,\n",
    "        )\n",
    "\n",
    "        aug = augmentation.add_hpss(samples)\n",
    "        librosa.output.write_wav(\n",
    "            '%s-4%s' % (root, ext),\n",
    "            aug.astype('float32'),\n",
    "            sample_rate,\n",
    "            norm = True,\n",
    "        )\n",
    "\n",
    "        aug = augmentation.strech(samples)\n",
    "        librosa.output.write_wav(\n",
    "            '%s-5%s' % (root, ext),\n",
    "            aug.astype('float32'),\n",
    "            sample_rate,\n",
    "            norm = True,\n",
    "        )\n",
    "\n",
    "        aug = augmentation.random_augmentation(samples)\n",
    "        librosa.output.write_wav(\n",
    "            '%s-6%s' % (root, ext),\n",
    "            aug.astype('float32'),\n",
    "            sample_rate,\n",
    "            norm = True,\n",
    "        )\n",
    "    except:\n",
    "        pass\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import soundfile\n",
    "\n",
    "sampling_rate = 22050\n",
    "n_fft = 2048\n",
    "frame_shift = 0.0125\n",
    "frame_length = 0.05\n",
    "hop_length = int(sampling_rate * frame_shift)\n",
    "win_length = int(sampling_rate * frame_length)\n",
    "n_mels = 80\n",
    "reduction_factor = 5\n",
    "\n",
    "def compute_spectrogram_feature(\n",
    "    samples,\n",
    "    sample_rate = 16000,\n",
    "    stride_ms = 10.0,\n",
    "    window_ms = 20.0,\n",
    "    max_freq = None,\n",
    "    eps = 1e-14,\n",
    "):\n",
    "    if max_freq is None:\n",
    "        max_freq = sample_rate / 2\n",
    "    if max_freq > sample_rate / 2:\n",
    "        raise ValueError(\n",
    "            'max_freq must not be greater than half of sample rate.'\n",
    "        )\n",
    "\n",
    "    if stride_ms > window_ms:\n",
    "        raise ValueError('Stride size must not be greater than window size.')\n",
    "\n",
    "    stride_size = int(0.001 * sample_rate * stride_ms)\n",
    "    window_size = int(0.001 * sample_rate * window_ms)\n",
    "\n",
    "    # Extract strided windows\n",
    "    truncate_size = (len(samples) - window_size) % stride_size\n",
    "    samples = samples[: len(samples) - truncate_size]\n",
    "    nshape = (window_size, (len(samples) - window_size) // stride_size + 1)\n",
    "    nstrides = (samples.strides[0], samples.strides[0] * stride_size)\n",
    "    windows = np.lib.stride_tricks.as_strided(\n",
    "        samples, shape = nshape, strides = nstrides\n",
    "    )\n",
    "    assert np.all(\n",
    "        windows[:, 1] == samples[stride_size : (stride_size + window_size)]\n",
    "    )\n",
    "\n",
    "    # Window weighting, squared Fast Fourier Transform (fft), scaling\n",
    "    weighting = np.hanning(window_size)[:, None]\n",
    "    fft = np.fft.rfft(windows * weighting, axis = 0)\n",
    "    fft = np.absolute(fft)\n",
    "    fft = fft ** 2\n",
    "    scale = np.sum(weighting ** 2) * sample_rate\n",
    "    fft[1:-1, :] *= 2.0 / scale\n",
    "    fft[(0, -1), :] /= scale\n",
    "    # Prepare fft frequency list\n",
    "    freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])\n",
    "\n",
    "    # Compute spectrogram feature\n",
    "    ind = np.where(freqs <= max_freq)[0][-1] + 1\n",
    "    specgram = np.log(fft[:ind, :] + eps)\n",
    "    return np.transpose(specgram, (1, 0))\n",
    "\n",
    "\n",
    "def get_spectrogram(fpath):\n",
    "    y, sr = librosa.load(fpath, sr = sampling_rate)\n",
    "    D = librosa.stft(\n",
    "        y = y, n_fft = n_fft, hop_length = hop_length, win_length = win_length\n",
    "    )\n",
    "    magnitude = np.abs(D)\n",
    "    power = magnitude ** 2\n",
    "    S = librosa.feature.melspectrogram(S = power, n_mels = n_mels)\n",
    "    return np.transpose(S.astype(np.float32))\n",
    "\n",
    "\n",
    "def reduce_frames(x, r_factor):\n",
    "    T, C = x.shape\n",
    "    num_paddings = reduction_factor - (T % r_factor) if T % r_factor != 0 else 0\n",
    "    padded = np.pad(x, [[0, num_paddings], [0, 0]], 'constant')\n",
    "    return np.reshape(padded, (-1, C * r_factor))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(33, 400)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "spectrogram = get_spectrogram(wavs[0])\n",
    "spectrogram = reduce_frames(spectrogram, reduction_factor)\n",
    "spectrogram.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "augments = glob.glob('augment/*.wav')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-19.36971185, -16.57376858, -18.71725863, ..., -23.41501521,\n",
       "        -23.68784775, -24.03949327],\n",
       "       [-15.61767618, -16.41841276, -19.80099433, ..., -20.95772327,\n",
       "        -21.39394458, -24.18905672],\n",
       "       [-15.02933384, -15.46920427, -19.5143906 , ..., -20.94875192,\n",
       "        -21.02911394, -21.28899505],\n",
       "       ...,\n",
       "       [-32.2361913 , -32.2361913 , -32.2361913 , ..., -32.2361913 ,\n",
       "        -32.2361913 , -32.2361913 ],\n",
       "       [-32.2361913 , -32.2361913 , -32.2361913 , ..., -32.2361913 ,\n",
       "        -32.2361913 , -32.2361913 ],\n",
       "       [-32.2361913 , -32.2361913 , -32.2361913 , ..., -32.2361913 ,\n",
       "        -32.2361913 , -32.2361913 ]])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data, _ = soundfile.read(augments[1])\n",
    "spectrogram = compute_spectrogram_feature(data)\n",
    "spectrogram"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not os.path.exists('spectrogram-train'):\n",
    "    os.mkdir('spectrogram-train')\n",
    "\n",
    "if not os.path.exists('spectrogram-test'):\n",
    "    os.mkdir('spectrogram-test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 2240/2240 [02:26<00:00, 16.25it/s]\n",
      "100%|██████████| 13642/13642 [15:26<00:00, 13.94it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "for i in tqdm(range(len(train_X))):\n",
    "    i = train_X[i]\n",
    "    loc = 'spectrogram-train/%s.npy'%(os.path.basename(i).split('.')[0])\n",
    "    \n",
    "    spectrogram = get_spectrogram(i)\n",
    "    spectrogram = reduce_frames(spectrogram, reduction_factor)\n",
    "    np.save(loc, spectrogram)\n",
    "    \n",
    "for i in tqdm(range(len(augments))):\n",
    "    i = augments[i]\n",
    "    loc = 'spectrogram-train/%s.npy'%(os.path.basename(i).split('.')[0])\n",
    "    spectrogram = get_spectrogram(i)\n",
    "    spectrogram = reduce_frames(spectrogram, reduction_factor)\n",
    "    np.save(loc, spectrogram)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 560/560 [00:36<00:00, 15.41it/s]\n"
     ]
    }
   ],
   "source": [
    "for i in tqdm(range(len(test_X))):\n",
    "    i = test_X[i]\n",
    "    loc = 'spectrogram-test/%s.npy'%(os.path.basename(i).split('.')[0])\n",
    "    spectrogram = get_spectrogram(i)\n",
    "    spectrogram = reduce_frames(spectrogram, reduction_factor)\n",
    "    np.save(loc, spectrogram)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
